setwd('~/Desktop/PhD/Others/BMS-BigData-Course-2025/data-analysis-2025/')Bulk RNA-seq data analysis - CD4/CD8 stimulation - Big Data course 2025
This markdown has the results for the analysis of the data used in the practice for the Big Data course 2025. Done by Nathan.
Initial set-up
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(stringr)
library(patchwork)
library(edgeR)
library(clusterProfiler)
library(enrichplot)
library(org.Hs.eg.db)
library(ReactomePA)
library(viridis)
library(ggrepel)
library(purrr)
library(biomaRt)
library(pheatmap)
library(RColorBrewer)
library(reshape2)
library(variancePartition)
library(forcats)# Define some functions
default_theme <- function(){
theme_bw() +
theme(panel.grid = element_blank(),
panel.border = element_rect(linewidth = 1),
axis.title = element_text(size = rel(1)),
axis.title.y.left = element_text(margin = margin(r = 10, unit = 'pt')),
axis.title.y.right = element_text(margin = margin(l = 10, unit = 'pt')),
axis.title.x.bottom = element_text(margin = margin(t = 10, unit = 'pt')),
plot.title = element_text(size = rel(1), hjust = 0.5, face = 'bold'),
axis.text = element_text(size = rel(1)),
axis.ticks.length = unit(5, 'pt'),
strip.background = element_rect(fill = "#f5a9a9", linewidth = 0),
strip.text = element_text(size = rel(1), color = "black")
)
}
plotPC <- function(pca_obj, coldata, coord_1, coord_2, color_by) {
var_explained <- pca$sdev^2 / sum(pca$sdev^2)
var_explained_df <- tibble(PC = paste0('PC', seq_along(var_explained)),
Var_Explained = var_explained)
df <- as.data.frame(cbind(coldata, pca$x))
x_label <- paste0(coord_1, ' (', round(var_explained_df$Var_Explained[var_explained_df$PC == coord_1]*100, digits = 2), '%)')
y_label <- paste0(coord_2, ' (', round(var_explained_df$Var_Explained[var_explained_df$PC == coord_2]*100, digits = 2), '%)')
p <- ggplot(df, aes(x = !!sym(coord_1), y = !!sym(coord_2), color = as.factor(!!sym(color_by)))) +
geom_point(size = 3) +
scale_color_viridis_d() +
labs(x = x_label, y = y_label) +
default_theme()
return(p)
}
# The next function we define is to create a scree plot
# Makes a Scree plot based on the PCA results, if n_pcs = NULL, plots all available PCs
plotScree <- function(pca, n_pcs = NULL){
if (is.null(n_pcs)){
n_pcs <- ncol(pca$x)
}
# Extract variance explained
var_explained <- pca$sdev^2 / sum(pca$sdev^2)
cumulative_var <- cumsum(var_explained)
# Create a data frame for plotting
plot_data <- data.frame(
PC = seq_along(var_explained),
Variance_Explained = var_explained,
Cumulative_Explained = cumulative_var
)
plot_data <- plot_data[1:n_pcs,]
# Plot
ggplot(plot_data, aes(x = factor(PC))) +
geom_bar(aes(y = Variance_Explained), stat = "identity", fill = "#22A884FF") +
geom_line(aes(y = Cumulative_Explained, group = 1), color = "#2A788EFF", linewidth = 1) +
geom_point(aes(y = Cumulative_Explained), color = "#2A788EFF", size = 2) +
scale_y_continuous(
name = "Fraction of Variance Explained",
sec.axis = sec_axis(~., name = "Cumulative Explained Variance"),
expand = c(0,0)
) +
labs(
title = "Scree Plot",
x = "Principal Component",
y = "Fraction of Variance"
) +
default_theme()
}Load the dataset
counts <- read.table('data/counts.tsv')
metadata <- read.table('data/metadata.tsv')
# Check sample names match
all(rownames(metadata) %in% colnames(counts))[1] TRUE
all(rownames(metadata) == colnames(counts))[1] TRUE
# Add extra information in the metadata
extra_metadata <- tibble(
sample = c('R0083', 'A0325', 'A0469', 'A0509', 'A0506', 'R0082'),
condition = c('CeD', 'Control', 'Control', 'CeD', 'Control', 'CeD'),
sex = c('F', 'F', 'M', 'F', 'M', 'F'),
age = c(16, 14, 14, 18, 17, 13)
)
# Change some names for clarity
metadata <- metadata |>
mutate(
celltype = if_else(celltype == 'CD4T', 'CD4', 'CD8'),
stimulation = case_when(
stimulation == 'beadstim' & celltype == 'CD4' ~ 'CD3_CD28_stimulation',
stimulation == 'unstim' & celltype == 'CD4' ~ 'unstimulated',
stimulation == 'stim' & celltype == 'CD8' ~ 'stimulated_CD4_supernatant',
stimulation == 'unstim' & celltype == 'CD8' ~ 'unstimulated_CD4_supernatant',
stimulation == 'SAB' & celltype == 'CD8' ~ 'basal_media'
)
)
metadata <- left_join(metadata, extra_metadata, by = 'sample')
metadata$full_sample <- paste(metadata$celltype, metadata$stimulation, metadata$condition, sep = ".")
rownames(metadata) <- colnames(counts)Quality control and exploratory analysis
y <- DGEList(counts = counts,
samples = metadata,
genes = rownames(counts))Remove poorly expressed genes
Here I am using counts > 1000 as cutoff
# Histogram of log(CPM)
cpm <- cpm(y, log = TRUE)
hist(cpm)
# With the following lines we add arbitrary cut-offs to histogram
abline(v = log10(100), col = "blue",lwd = 2)
abline(v = log10(500), col = "purple",lwd = 2)
abline(v = log10(1000), col = "red",lwd = 2)keep <- filterByExpr(y,
group = y$samples$group, # considers the group when filtering - optional
min.total.count = 1000) # the number you decided based on the histogram above
y_filtered <- y[keep, , keep.lib.sizes=FALSE]
print(paste0("Number of genes before filtering: ", dim(y)[1]))[1] "Number of genes before filtering: 22334"
print(paste0("Number of genes after filtering: ", dim(y_filtered)[1]))[1] "Number of genes after filtering: 3482"
Principal component analysis (PCA)
cpm <- cpm(y_filtered, log = TRUE)
pca <- prcomp(t(cpm), scale. = TRUE)plotScree(pca, n_pcs = 20)# Plot the first 2 PCs colored by the condition
p1 <- plotPC(pca,
coldata = y_filtered$samples,
coord_1 = 'PC1',
coord_2 = 'PC2',
color_by = 'stimulation')
p2 <- plotPC(pca,
coldata = y_filtered$samples,
coord_1 = 'PC1',
coord_2 = 'PC2',
color_by = 'celltype')
p3 <- plotPC(pca,
coldata = y_filtered$samples,
coord_1 = 'PC1',
coord_2 = 'PC2',
color_by = 'condition')
print(p1 + p2 + p3)p4 <- plotPC(pca,
coldata = y_filtered$samples,
coord_1 = 'PC1',
coord_2 = 'PC2',
color_by = 'sex')
p5 <- plotPC(pca,
coldata = y_filtered$samples,
coord_1 = 'PC1',
coord_2 = 'PC2',
color_by = 'age')
p6 <- plotPC(pca,
coldata = y_filtered$samples,
coord_1 = 'PC1',
coord_2 = 'PC2',
color_by = 'sample')
print(p4+p5+p6)Normalization of read counts
y_filtered <- calcNormFactors(y_filtered)Variance partition analysis
Alternatively to the PCA, you can use a “Variance Partition analysis” to quantify and interpret the sources of biological and technical variation in your data. This uses the package variancePartition. This uses the TMM normalized counts and a formula with the variables from your metadata you want to check (more on models and formulas will be explained in future steps). You can investigate any variables you want at once here, instead of one by one as you would do in the PCA.
# First we assess correlation between all pairs of variables
form <- ~ condition + celltype + age + sex + stimulation + lib.size
C <- canCorPairs(form, y_filtered$samples)Warning in canCorPairs(form, y_filtered$samples): Regression model may be problematic.
High colinearity between variables:
celltype and stimulation
plotCorrMatrix(C)Now we are going to fit a model in the gene expression data to check how much each variable contributes to the variation, you can choose which variables to include based on the previous plot.
design <- model.matrix(~condition, y_filtered$samples)
v <- voom(y_filtered, design)
# We need to scale some variables so they are all in the same scale and comparable
y_filtered$samples$age <- scale(y_filtered$samples$age)
y_filtered$samples$lib.size_scaled <- scale(y_filtered$samples$lib.size)
# Categorical variables need to be specified with (1|variable)
form <- ~ (1|condition) + (1|sex) + (1|stimulation) + age + lib.size_scaled
varPart <- fitExtractVarPartModel(v, form, y_filtered$samples)
plotVarPart(varPart)Differential expression analysis
Defining design matrix
# Here it is also important that the categorial variables are factors and the numerical variables are on the same scale
y_filtered$samples$sex <- factor(y_filtered$samples$sex)
y_filtered$samples$full_sample <- factor(y_filtered$samples$full_sample)
# The parameter data in model.matrix() is your metadata (y$samples)
design <- model.matrix(~ 0 + full_sample + sex + age, data = y_filtered$samples)
# Cleaning names for clarirty
colnames(design) <- str_replace_all(colnames(design), "full_sample", "")
knitr::kable(head(design))| CD4.CD3_CD28_stimulation.CeD | CD4.CD3_CD28_stimulation.Control | CD4.unstimulated.CeD | CD4.unstimulated.Control | CD8.basal_media.CeD | CD8.basal_media.Control | CD8.stimulated_CD4_supernatant.CeD | CD8.stimulated_CD4_supernatant.Control | CD8.unstimulated_CD4_supernatant.CeD | CD8.unstimulated_CD4_supernatant.Control | sexM | age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| X10_R0083_CD4T_bead_stim | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.3651484 |
| X11_A0325_CD4T_bead_stim | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -0.7302967 |
| X12_A0469_CD4T_bead_stim | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -0.7302967 |
| X13_A0509_CD8_stim | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1.4605935 |
| X14_A0506_CD8_stim | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0.9128709 |
| X15_R0082_CD8_stim | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | -1.2780193 |
Contrasts matrix for CD4 T cells.
contrasts_matrix <- makeContrasts(
# Effect of CD3/CD28 stimulation on CD4 control
CD4_stim_control = CD4.CD3_CD28_stimulation.Control - CD4.unstimulated.Control,
# Effect of CD3/CD28 stimulation on CD4 CeD
CD4_stim_CeD = CD4.CD3_CD28_stimulation.CeD - CD4.unstimulated.CeD,
# Effect of CD3/CD28 stimulation on CD4 (combined)
CD4_stim_combined = (CD4.CD3_CD28_stimulation.Control + CD4.CD3_CD28_stimulation.CeD)/2 - (CD4.unstimulated.Control + CD4.unstimulated.CeD)/2,
# What is the difference between CD4 CeD stimulated and CD4 Control stimulated?
Diff_CD4_stim = CD4.CD3_CD28_stimulation.CeD - CD4.CD3_CD28_stimulation.Control,
# What is the difference between CD4 CeD stimulated and CD4 Control stimulated?
Diff_CD4_unstim = CD4.unstimulated.CeD - CD4.unstimulated.Control,
levels = colnames(design)
)
knitr::kable(contrasts_matrix)| CD4_stim_control | CD4_stim_CeD | CD4_stim_combined | Diff_CD4_stim | Diff_CD4_unstim | |
|---|---|---|---|---|---|
| CD4.CD3_CD28_stimulation.CeD | 0 | 1 | 0.5 | 1 | 0 |
| CD4.CD3_CD28_stimulation.Control | 1 | 0 | 0.5 | -1 | 0 |
| CD4.unstimulated.CeD | 0 | -1 | -0.5 | 0 | 1 |
| CD4.unstimulated.Control | -1 | 0 | -0.5 | 0 | -1 |
| CD8.basal_media.CeD | 0 | 0 | 0.0 | 0 | 0 |
| CD8.basal_media.Control | 0 | 0 | 0.0 | 0 | 0 |
| CD8.stimulated_CD4_supernatant.CeD | 0 | 0 | 0.0 | 0 | 0 |
| CD8.stimulated_CD4_supernatant.Control | 0 | 0 | 0.0 | 0 | 0 |
| CD8.unstimulated_CD4_supernatant.CeD | 0 | 0 | 0.0 | 0 | 0 |
| CD8.unstimulated_CD4_supernatant.Control | 0 | 0 | 0.0 | 0 | 0 |
| sexM | 0 | 0 | 0.0 | 0 | 0 |
| age | 0 | 0 | 0.0 | 0 | 0 |
Contrasts matrix for CD8 T cells.
contrasts_matrix_cd8 <- makeContrasts(
# Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - Global effect
CD8_stim_combined = (CD8.stimulated_CD4_supernatant.CeD + CD8.stimulated_CD4_supernatant.Control)/2 - (CD8.basal_media.CeD + CD8.basal_media.Control)/2,
# Effect of unstimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - Global effect
CD8_unstim_combined = (CD8.unstimulated_CD4_supernatant.CeD + CD8.unstimulated_CD4_supernatant.Control)/2 - (CD8.basal_media.CeD + CD8.basal_media.Control)/2,
# Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to unstimulated) - Global effect
CD8_stim_vs_unstim_combined = (CD8.stimulated_CD4_supernatant.CeD + CD8.stimulated_CD4_supernatant.Control)/2 - (CD8.unstimulated_CD4_supernatant.CeD + CD8.unstimulated_CD4_supernatant.Control)/2,
# Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - Controls
CD8_stim_control = CD8.stimulated_CD4_supernatant.Control - CD8.basal_media.Control,
# Effect of unstimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - Controls
CD8_unstim_control = CD8.unstimulated_CD4_supernatant.Control - CD8.basal_media.Control,
# Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to unstimulated) - Controls
CD8_stim_vs_unstim_control = CD8.stimulated_CD4_supernatant.Control - CD8.unstimulated_CD4_supernatant.Control,
# Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - CeD
CD8_stim_CeD = CD8.stimulated_CD4_supernatant.CeD - CD8.basal_media.CeD,
# Effect of unstimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - CeD
CD8_unstim_CeD = CD8.unstimulated_CD4_supernatant.CeD - CD8.basal_media.CeD,
# Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to unstimulated) - CeD
CD8_stim_vs_unstim_CeD = CD8.stimulated_CD4_supernatant.CeD - CD8.unstimulated_CD4_supernatant.CeD,
# What is the difference between CD8 CeD + stimulated supernatant and control?
Diff_CD8_stim = CD8.stimulated_CD4_supernatant.CeD - CD8.stimulated_CD4_supernatant.Control,
# What is the difference between CD8 CeD + unstimulated supernatant and control?
Diff_CD8_unstim = CD8.stimulated_CD4_supernatant.CeD - CD8.stimulated_CD4_supernatant.Control,
levels = colnames(design)
)
knitr::kable(contrasts_matrix_cd8)| CD8_stim_combined | CD8_unstim_combined | CD8_stim_vs_unstim_combined | CD8_stim_control | CD8_unstim_control | CD8_stim_vs_unstim_control | CD8_stim_CeD | CD8_unstim_CeD | CD8_stim_vs_unstim_CeD | Diff_CD8_stim | Diff_CD8_unstim | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| CD4.CD3_CD28_stimulation.CeD | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| CD4.CD3_CD28_stimulation.Control | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| CD4.unstimulated.CeD | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| CD4.unstimulated.Control | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| CD8.basal_media.CeD | -0.5 | -0.5 | 0.0 | 0 | 0 | 0 | -1 | -1 | 0 | 0 | 0 |
| CD8.basal_media.Control | -0.5 | -0.5 | 0.0 | -1 | -1 | 0 | 0 | 0 | 0 | 0 | 0 |
| CD8.stimulated_CD4_supernatant.CeD | 0.5 | 0.0 | 0.5 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 1 |
| CD8.stimulated_CD4_supernatant.Control | 0.5 | 0.0 | 0.5 | 1 | 0 | 1 | 0 | 0 | 0 | -1 | -1 |
| CD8.unstimulated_CD4_supernatant.CeD | 0.0 | 0.5 | -0.5 | 0 | 0 | 0 | 0 | 1 | -1 | 0 | 0 |
| CD8.unstimulated_CD4_supernatant.Control | 0.0 | 0.5 | -0.5 | 0 | 1 | -1 | 0 | 0 | 0 | 0 | 0 |
| sexM | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| age | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Fit the model and find DEGs
v <- voom(y_filtered, design, plot=TRUE) # change to FALSE if you don't want the plot# Fit the model
fit <- lmFit(v, design)
res_all_cd4 <- list()
for (x in colnames(contrasts_matrix)) {
contr <- contrasts_matrix[, x]
fit2 <- contrasts.fit(fit, contr)
fit2 <- eBayes(fit2)
res <- topTable(fit2, sort.by = "P", n = Inf)
res$Contrast <- x
res_all_cd4 <- append(res_all_cd4, list(res))
}
results_cd4 <- list_rbind(res_all_cd4)
res_all_cd8 <- list()
for (x in colnames(contrasts_matrix_cd8)) {
contr <- contrasts_matrix_cd8[, x]
fit2 <- contrasts.fit(fit, contr)
fit2 <- eBayes(fit2)
res <- topTable(fit2, sort.by = "P", n = Inf)
res$Contrast <- x
res_all_cd8 <- append(res_all_cd8, list(res))
}
results_cd8 <- list_rbind(res_all_cd8)This list have the genes with their Ensembl ID, to help with interpretation we will add the gene symbol and entrezid annotation before we continue.
mart <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
genemap <- getBM(attributes = c("ensembl_gene_id", "hgnc_symbol", "entrezgene_id"),
filters = "ensembl_gene_id",
values = unique(results_cd4$genes),
mart = mart)
results_cd4 <- left_join(results_cd4, genemap, by = join_by("genes" == "ensembl_gene_id"), relationship = 'many-to-many')
genemap <- getBM(attributes = c("ensembl_gene_id", "hgnc_symbol", "entrezgene_id"),
filters = "ensembl_gene_id",
values = unique(results_cd8$genes),
mart = mart)
results_cd8 <- left_join(results_cd8, genemap, by = join_by("genes" == "ensembl_gene_id"), relationship = 'many-to-many')Saving the results. Considering significant genes with |log2FC| > 1 and adj. p-value < 0.05.
results_significant_cd4 <- results_cd4 |>
dplyr::filter(abs(logFC) > 1 & adj.P.Val < 0.05)
results_significant_cd8 <- results_cd8 |>
dplyr::filter(abs(logFC) > 1 & adj.P.Val < 0.05)
# Save the results
# Specify the path to the folder you want to save the results
folder_path <- "results"
# Check if the folder exists
if (!dir.exists(folder_path)) {
# If it doesn't exist, create it
dir.create(folder_path)
cat(paste0("Folder '", folder_path, "' created successfully!\n"))
} else {
cat(paste0("Folder '", folder_path, "' already exists.\n"))
}Folder 'results' already exists.
write.csv(results_cd4, paste0(folder_path, "/DEG_results_CD4T_cells.csv"), row.names=F)
write.csv(results_significant_cd4, paste0(folder_path, "/DEG_results_CD4T_cells_significant_only.csv"), row.names=F)
write.csv(results_cd8, paste0(folder_path, "/DEG_results_CD8T_cells.csv"), row.names=F)
write.csv(results_significant_cd8, paste0(folder_path, "/DEG_results_CD8T_cells_significant_only.csv"), row.names=F)# Define a function to make a volcano plot
# Makes a volcano plot, highlighting the genes above the specified log2FC threshold and labels the top n genes (based on padj)
VolcanoPlot <- function(res, threshold = 0.5, p_cutoff = 0.05, n_labels = 15, title = '') {
# Prepare the data
res_plot <- res |>
mutate(DE = case_when(
adj.P.Val < p_cutoff & logFC >= threshold ~ 'upregulated',
adj.P.Val < p_cutoff & logFC <= -threshold ~ 'downregulated',
adj.P.Val >= p_cutoff | abs(logFC) < threshold ~ 'not DE'
))
# Add label to the top n genes (ranked by FC)
n <- n_labels
top_genes <- res_plot |> dplyr::filter(DE != 'not DE') |>
group_by(sign(logFC)) |>
top_n(n_labels, -log10(adj.P.Val)) |>
pull(hgnc_symbol)
res_plot$genelabel <- ''
res_plot$genelabel[res_plot$hgnc_symbol %in% top_genes] <- res_plot$hgnc_symbol[res_plot$hgnc_symbol %in% top_genes]
color_map <- c("not DE" = "gray", "downregulated" = "#2166ac", "upregulated" = "#b2182b")
ggplot(res_plot, aes(x = logFC, y = -log10(adj.P.Val))) +
geom_point(aes(colour = DE), size = 1.5) +
geom_text_repel(aes(label = genelabel), min.segment.length = 0.25, force = 10) +
geom_hline(yintercept = -log10(p_cutoff), colour = '#696969', linetype = 'dashed') +
geom_vline(xintercept = -threshold, colour = '#696969', linetype = 'dashed') +
geom_vline(xintercept = threshold, colour = '#696969', linetype = 'dashed') +
scale_color_manual('', values = color_map) +
default_theme() +
theme(legend.text = element_text(size = rel(1.25))) +
labs(title = title,
x = 'log2 fold change',
y = '-log10 adjusted p-value')
}CD4 T cells
volcanos_cd4 <- list()
for(c in unique(results_cd4$Contrast)) {
data <- dplyr::filter(results_cd4, Contrast == c)
p <- VolcanoPlot(data, threshold = 1, p_cutoff = 0.05, n_labels = 15, title = c)
volcanos_cd4 <- append(volcanos_cd4, list(p))
}
wrap_plots(volcanos_cd4, guides = 'collect', axis_titles = 'collect')Warning: Removed 27 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
Removed 27 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
Removed 27 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider increasing max.overlaps
ggrepel: 7 unlabeled data points (too many overlaps). Consider increasing max.overlaps
How to interpret the comparisons:
| Contrast | Meaning |
|---|---|
| CD4_stim_control | Effect of CD3/CD28 stimulation on CD4 control |
| CD4_stim_CeD | Effect of CD3/CD28 stimulation on CD4 CeD |
| CD4_stim_combined | Effect of CD3/CD28 stimulation on CD4 (CeD and controls combined) |
| Diff_CD4_stim | The difference between CD4 CeD stimulated and CD4 Control stimulated |
| Diff_CD4_unstim | The difference between CD4 CeD unstimulated and CD4 Control unstimulated |
The heatmap below show the expression of all these DEGs.
de_genes <- results_significant_cd4$genes |> unique()
# Choose samples to show - only CD4 T samples
samples <- y_filtered$samples |>
filter(celltype == 'CD4')
samples_to_plot <- rownames(samples)
# Get the logCPM counts for these genes
cpm_de <- cpm(y_filtered, log = TRUE)
cpm_de <- cpm_de[de_genes, samples_to_plot]
# Calculates z-score
z <- t(scale(t(cpm_de)))
# Create annotation dataframe for the heatmap - this can be whatever you want to show about the samples (columns)
annotation_col <- y_filtered$samples[colnames(z), c('stimulation', 'condition')]
ann_colors <- list(
condition = c('Control' = '#29d3d6',
'CeD' = '#910a0a'),
stimulation = c('CD3_CD28_stimulation' = '#FEE5D9',
'unstimulated' = '#FCAE91'))
# Make the heatmap with values scaled by row (genes), so you can compare between samples (columns)
pheatmap(z, cluster_rows = T, show_rownames = F, show_colnames = F, border_color = NA,
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
annotation_col = annotation_col,
annotation_colors = ann_colors,
treeheight_col = FALSE)Checking enriched pathways for the CD4 comparisons
# Define the function to create the ranked gene list and run the analysis
run_gsea <- function(data, contrast, database = c('GO-BP', 'KEGG', 'Reactome')) {
# Making sure there are no duplicated entrezid codes
data <- data |> filter(Contrast == contrast) |> distinct(entrezgene_id, .keep_all = TRUE)
# Creates the ordered gene list for given contrast
foldchanges <- data$logFC
names(foldchanges) <- data$entrezgene_id
foldchanges <- sort(foldchanges, decreasing = TRUE)
# Run the analysis for given database
if(database == 'GO-BP') {
res <- gseGO(geneList = foldchanges, OrgDb = "org.Hs.eg.db", keyType = "ENTREZID", ont = "BP", pvalueCutoff = 0.05)
} else if(database == 'KEGG') {
res <- gseKEGG(geneList = foldchanges, organism = 'hsa', pvalueCutoff = 0.05)
} else if(database == 'Reactome') {
res <- gsePathway(geneList = foldchanges, organism = 'human', pvalueCutoff = 0.05)
} else {
print("Choose a database among GO-BP, KEGG or Reactome.")
}
return(res)
}
# Function to plot the results
plot_gsea <- function(data, contrast, n = 10) {
data_plot <- data[[contrast]]
data_plot <- as.data.frame(data_plot@result)
data_plot <- data_plot |>
arrange(desc(abs(NES))) |>
group_by(sign(NES)) |>
dplyr::slice(1:n)
p <- ggplot(data_plot, showCategory = n*2, aes(NES, fct_reorder(Description,NES), fill = -log10(p.adjust))) +
geom_col() +
geom_vline(xintercept = 0, colour = '#696969', linetype = 'dashed') +
scale_fill_gradientn(colours=c("#b3eebe", "#46bac2", "#371ea3"),
guide=guide_colorbar(reverse=TRUE)) +
labs(title = contrast,
x = 'Normalized Enrichment Score',
y = '') +
default_theme() +
theme(text = element_text(size = 11))
return(p)
}Apply the funcion iteratively and plot the results.
contrasts <- unique(results_significant_cd4$Contrast)
# Remove NA entrezid from the results
data_cd4 <- results_significant_cd4 |> filter(!is.na(entrezgene_id))
# GO-BP
gsea_cd4_go <- map(contrasts, run_gsea, data = data_cd4, database = 'GO-BP')using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.26% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some pathways, in reality P-values are less than 1e-10. You can
set the `eps` argument to zero for better estimation.
leading edge analysis...
done...
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.22% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : For some pathways, in reality P-values are less than 1e-10. You can set the `eps` argument to zero for better estimation.
leading edge analysis...
done...
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
leading edge analysis...
done...
names(gsea_cd4_go) <- contrasts
gsea_cd4_go_plots <- map(contrasts, plot_gsea, data = gsea_cd4_go, n = 10)
wrap_plots(gsea_cd4_go_plots, ncol = 1) + plot_annotation(title = 'GO-BP')# KEGG
gsea_cd4_kegg <- map(contrasts, run_gsea, data = data_cd4, database = 'KEGG')Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.26% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
leading edge analysis...
done...
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.22% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
leading edge analysis...
done...
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
leading edge analysis...
done...
names(gsea_cd4_kegg) <- contrasts
gsea_cd4_kegg_plots <- map(contrasts, plot_gsea, data = gsea_cd4_kegg, n = 10)
wrap_plots(gsea_cd4_kegg_plots, ncol = 1) + plot_annotation(title = 'KEGG')# ReactomePA
gsea_cd4_pa <- map(contrasts, run_gsea, data = data_cd4, database = 'Reactome')using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.26% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
leading edge analysis...
done...
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.22% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
leading edge analysis...
done...
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
leading edge analysis...
done...
names(gsea_cd4_pa) <- contrasts
gsea_cd4_pa_plots <- map(contrasts, plot_gsea, data = gsea_cd4_pa, n = 10)
wrap_plots(gsea_cd4_pa_plots, ncol = 1) + plot_annotation(title = 'Reactome')Checking overlap between DEGs in Control and CeD. Are there any CeD-specific gene or are the fold-changes different in CeD for overlapping genes?
cd4_res_control <- results_significant_cd4 |> filter(Contrast == 'CD4_stim_control' & !is.na(hgnc_symbol)) |> distinct(genes, .keep_all = TRUE)
cd4_res_ced <- results_significant_cd4 |> filter(Contrast == 'CD4_stim_CeD' & !is.na(hgnc_symbol)) |> distinct(genes, .keep_all = TRUE)
library(ggvenn)Loading required package: grid
ggvenn(list(
Control = cd4_res_control$genes,
CeD = cd4_res_ced$genes
))Inspecting the unique CeD genes
unique_ced_genes <- setdiff(cd4_res_ced$hgnc_symbol, cd4_res_control$hgnc_symbol)
print(unique_ced_genes) [1] "ARL6IP1" "FLOT2" "SARAF" "GNL2" "ITGB7" "NCL"
[7] "CCT3" "KIF22" "EBNA1BP2" "CYTIP" "LY6E" "EIF5B"
[13] "MYADM" "SLC1A5" "PHB1" "CCT8P1" "VDAC1" "PKM"
[19] "EMP3" "ATIC" "KPNB1" "DEF6" "CD3E" "PIK3R1"
[25] "SH2D2A" "PIM1" "CD96" "SRPRB" "SIT1" "TMSB4X"
[31] "LTV1" "RSL1D1" "ARHGAP4" "HJURP" "HMGB2" "ZDHHC12"
[37] "CBX6" "EED" "SLC16A1" "EMB" "TMA16" "APEX1"
[43] "CD59" "POLR3K" "CDK4" "NOB1" "NIPAL3" "KLF13"
[49] "HMGCS1" "TMSB4XP4" "CCT8" "DCTPP1" "ETV6" "TWF2"
[55] "SAMD9" "SOS1" "SPTBN1" "TFAM" "MECP2" "ATP2B1"
[61] "H1-10" "TNRC6B" "S100A11" "H1-2" "KNOP1" "BMS1"
[67] "ARHGAP18" "CAPN1" "IL27RA" "CEP57" "DOCK8" "AP1S2"
[73] "FXYD5" "GCN1" "ADA" "CKAP2" "HSPA8" "DDX1"
[79] "MYO1G" "AKAP13" "RNF167" "LDHA" "DCTN3" "NTAN1"
[85] "HMMR" "PDE3B" "TMSB4XP8" "LSP1" "UTP3" "CASP8AP2"
[91] "STAT3" "MT2A" "RRBP1" "MZT2A" "ZNF367" "EIF6"
[97] "SKAP1" "RPL7L1" "GRN" "PARP9" "CASP4" "SUSD3"
[103] "NMT2" "ANXA5" "SRF" "RPIA" "LRCH4" "CTSS"
[109] "POLR2C" "RHOC" "TGFB1" "NIN" "DGKZ" "KCTD17"
[115] "DLGAP5" "BAX" "RNF149" "DYNC2I2" "RSRP1" "JAK3"
[121] "MIEF1" "ZAP70" "H4C3" "FAM117A" "MICAL2" "ABR"
[127] "ILK" "SNHG8" "MRPL15" "DCP2" "RRP7A" "NBEAL2"
[133] "HEBP2" "MIER1" "SLC25A44" "SLC7A1" "PIK3R5" "UBE2L5"
[139] "PCBP4" "GIMAP6" "SCARNA9" "MT-TL1" "PPM1M" "RPS28"
[145] "MT-TP" "SNX14" "TIMM10" "HDAC7" "GYG1" "SYTL2"
# Heatmap with these genes
de_genes <- setdiff(cd4_res_ced$genes, cd4_res_control$genes)
# Choose samples to show - only CD4 T samples
samples <- y_filtered$samples |>
filter(celltype == 'CD4')
samples_to_plot <- rownames(samples)
# Get the logCPM counts for these genes
cpm_de <- cpm(y_filtered, log = TRUE)
cpm_de <- cpm_de[de_genes, samples_to_plot]
# Calculates z-score
z <- t(scale(t(cpm_de)))
# Create annotation dataframe for the heatmap - this can be whatever you want to show about the samples (columns)
annotation_col <- y_filtered$samples[colnames(z), c('stimulation', 'condition')]
ann_colors <- list(
condition = c('Control' = '#29d3d6',
'CeD' = '#910a0a'),
stimulation = c('CD3_CD28_stimulation' = '#FEE5D9',
'unstimulated' = '#FCAE91'))
rownames(cd4_res_ced) <- cd4_res_ced$genes
annotation_row <- cd4_res_ced[rownames(z), c('genes', 'hgnc_symbol')]
rownames(z) <- annotation_row$hgnc_symbol
# Make the heatmap with values scaled by row (genes), so you can compare between samples (columns)
pheatmap(z, cluster_rows = T, cluster_cols = T, show_rownames = T, show_colnames = F, border_color = NA,
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
annotation_col = annotation_col,
annotation_colors = ann_colors,
treeheight_col = 5,
fontsize = 7
)Are these unique genes enriched for something?
unique_data <- cd4_res_ced |> filter(hgnc_symbol %in% unique_ced_genes)
unique_data <- unique_data |> filter(!is.na(entrezgene_id))
# GSEA
unique_go <- run_gsea(unique_data, contrast = 'CD4_stim_CeD', database = 'GO-BP')using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
leading edge analysis...
done...
unique_kegg <- run_gsea(unique_data, contrast = 'CD4_stim_CeD', database = 'KEGG')using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
no term enriched under specific pvalueCutoff...
unique_pa <- run_gsea(unique_data, contrast = 'CD4_stim_CeD', database = 'Reactome')using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
no term enriched under specific pvalueCutoff...
# ORA
universe <- results_cd4 |>
filter(Contrast == 'CD4_stim_CeD' & !is.na(entrezgene_id)) |> pull(genes)
ora_go <- enrichGO(
gene = unique_data$entrezgene_id,
universe = universe,
ont = "BP",
keyType = 'ENTREZID',
OrgDb = org.Hs.eg.db,
pvalueCutoff = 0.05,
readable = TRUE
)No gene sets have size between 10 and 500 ...
--> return NULL...
ora_kegg <- enrichKEGG(
gene = unique_data$entrezgene_id,
universe = universe,
pvalueCutoff = 0.05
)No gene sets have size between 10 and 500 ...
--> return NULL...
ora_pa <- enrichPathway(
gene = unique_data$entrezgene_id,
universe = universe,
pvalueCutoff = 0.05
)No gene sets have size between 10 and 500 ...
--> return NULL...
No enrichment.
CD8 T cells
volcanos_cd8 <- list()
for(c in unique(results_cd8$Contrast)) {
data <- dplyr::filter(results_cd8, Contrast == c)
p <- VolcanoPlot(data, threshold = 1, p_cutoff = 0.05, n_labels = 15, title = c)
volcanos_cd8 <- append(volcanos_cd8, list(p))
}
wrap_plots(volcanos_cd8[1:3], guides = 'collect', axis_titles = 'collect')Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
wrap_plots(volcanos_cd8[4:6], guides = 'collect', axis_titles = 'collect')Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider increasing max.overlaps
ggrepel: 5 unlabeled data points (too many overlaps). Consider increasing max.overlaps
wrap_plots(volcanos_cd8[7:9], guides = 'collect', axis_titles = 'collect')Warning: Removed 27 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
wrap_plots(volcanos_cd8[10:11], guides = 'collect', axis_titles = 'collect')How to interpret the comparisons:
| Contrast | Meaning |
|---|---|
| CD8_stim_combined | Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - Global effect |
| CD8_unstim_combined | Effect of unstimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - Global effect |
| CD8_stim_vs_unstim_combined | Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to unstimulated) - Global effect |
| CD8_stim_control | Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - Controls |
| CD8_unstim_control | Effect of unstimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - Controls |
| CD8_stim_vs_unstim_control | Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to unstimulated) - Controls |
| CD8_stim_CeD | Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - CeD |
| CD8_unstim_CeD | Effect of unstimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - CeD |
| CD8_stim_vs_unstim_CeD | Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to unstimulated) - CeD |
| Diff_CD8_stim | What is the difference between CD8 CeD + stimulated supernatant and control? |
| Diff_CD8_unstim | What is the difference between CD8 CeD + unstimulated supernatant and control? |
Focusing the following steps on the comparisons CD8 stim vs unstim, because I believe is more informative to see what is the effect of stimulated CD4 T cells on CD8 T cells, when compared to unstimulated CD4 (insted of the basal media). What DEGs are shared and unique to CeD and control?
cd8_res_control <- results_significant_cd8 |> filter(Contrast == 'CD8_stim_vs_unstim_control' & !is.na(hgnc_symbol)) |> distinct(genes, .keep_all = TRUE)
cd8_res_ced <- results_significant_cd8 |> filter(Contrast == 'CD8_stim_vs_unstim_CeD' & !is.na(hgnc_symbol)) |> distinct(genes, .keep_all = TRUE)
ggvenn(list(
Control = cd8_res_control$genes,
CeD = cd8_res_ced$genes
))